import pandas as pd
import numpy as np
from pathlib import Path
import plotly.express as px
pd.options.plotting.backend = 'plotly'
# from dsc80_utils import * # Feel free to uncomment and use this.
Step 1: Introduction¶
TODO¶
Step 1: Introduction¶
Dataset Chosen: League of Legends 2022 Esports Match Data (Oracle’s Elixir)
What is this dataset?
This dataset contains post-game statistics from over 10,000 professional League of Legends matches. Each match includes information on individual players, teams, and game outcomes.
What question will I explore?
Which role “carries” the team more often: ADCs (Bot lane) or Mid laners?
Why is this question important?
In professional League of Legends, each role contributes differently to team success. Understanding which role is most likely to carry can help teams optimize draft strategies, training focus, and viewer insights.
How big is the dataset?
This dataset contains approximately df.shape[0] rows. Each game has about 12 rows: 10 for players (5 on each team) and 2 for team summary stats.
Which columns are relevant to this question?
position: player's role in the match (e.g., top, jng, mid, bot, sup)kills,deaths,assists: core stats used to evaluate performancedpm: damage per minute, a key indicator of contributionteamname: name of the team the player belongs toside: whether the team played on blue or red side
Initial plan: I will first clean the data to remove team-level rows, then focus on MID and BOT roles, and compare their contribution using performance metrics.
Step 2: Data Cleaning and Exploratory Data Analysis¶
import pandas as pd
import plotly.express as px
# 读取数据(关闭分块警告)
df = pd.read_csv('data/2022_LoL_esports_match_data_from_OraclesElixir.csv', low_memory=False)
# 查看前几行
df.head()
| gameid | datacompleteness | url | league | year | split | playoffs | date | game | patch | ... | opp_csat25 | golddiffat25 | xpdiffat25 | csdiffat25 | killsat25 | assistsat25 | deathsat25 | opp_killsat25 | opp_assistsat25 | opp_deathsat25 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | ESPORTSTMNT01_2690210 | complete | NaN | LCKC | 2022 | Spring | 0 | 2022-01-10 07:44:08 | 1 | 12.01 | ... | 203.0 | 605.0 | -525.0 | 9.0 | 0.0 | 1.0 | 1.0 | 0.0 | 2.0 | 0.0 |
| 1 | ESPORTSTMNT01_2690210 | complete | NaN | LCKC | 2022 | Spring | 0 | 2022-01-10 07:44:08 | 1 | 12.01 | ... | 163.0 | 421.0 | -903.0 | -28.0 | 2.0 | 4.0 | 2.0 | 1.0 | 5.0 | 1.0 |
| 2 | ESPORTSTMNT01_2690210 | complete | NaN | LCKC | 2022 | Spring | 0 | 2022-01-10 07:44:08 | 1 | 12.01 | ... | 187.0 | -149.0 | -224.0 | -5.0 | 1.0 | 3.0 | 0.0 | 3.0 | 4.0 | 3.0 |
| 3 | ESPORTSTMNT01_2690210 | complete | NaN | LCKC | 2022 | Spring | 0 | 2022-01-10 07:44:08 | 1 | 12.01 | ... | 284.0 | -1288.0 | -2005.0 | -85.0 | 2.0 | 1.0 | 2.0 | 3.0 | 4.0 | 0.0 |
| 4 | ESPORTSTMNT01_2690210 | complete | NaN | LCKC | 2022 | Spring | 0 | 2022-01-10 07:44:08 | 1 | 12.01 | ... | 27.0 | 499.0 | -314.0 | 12.0 | 1.0 | 3.0 | 2.0 | 0.0 | 7.0 | 2.0 |
5 rows × 161 columns
Data Cleaning¶
We remove the team-level rows by keeping only rows where position is one of the five standard roles: top, jng, mid, bot, and sup.
We also drop rows with missing values in key statistics such as kills, deaths, assists, or dpm.
df_players = df[df['position'].isin(['top', 'jng', 'mid', 'bot', 'sup'])].copy()
relevant_columns = ['gameid', 'playername', 'position', 'teamname', 'side',
'kills', 'deaths', 'assists', 'dpm']
df_players = df_players[relevant_columns]
df_players = df_players.dropna(subset=['kills', 'deaths', 'assists', 'dpm'])
df_players.head()
| gameid | playername | position | teamname | side | kills | deaths | assists | dpm | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | ESPORTSTMNT01_2690210 | Soboro | top | BRION Challengers | Blue | 2 | 3 | 2 | 552.2942 |
| 1 | ESPORTSTMNT01_2690210 | Raptor | jng | BRION Challengers | Blue | 2 | 5 | 6 | 412.0841 |
| 2 | ESPORTSTMNT01_2690210 | Feisty | mid | BRION Challengers | Blue | 2 | 2 | 3 | 499.4046 |
| 3 | ESPORTSTMNT01_2690210 | Gamin | bot | BRION Challengers | Blue | 2 | 4 | 2 | 389.0018 |
| 4 | ESPORTSTMNT01_2690210 | Loopy | sup | BRION Challengers | Blue | 1 | 5 | 6 | 128.3012 |
Univariate Analysis¶
We look at the distribution of positions, DPM (damage per minute), and kills.
px.histogram(df_players, x='position', title='Distribution of Player Positions')
Most players are evenly distributed across the five roles.
px.histogram(df_players, x='dpm', nbins=40, title='Distribution of Damage Per Minute (DPM)')
Most players have a DPM between 300 and 600, with a few extreme outliers over 1000.
Bivariate Analysis¶
We now explore how player position relates to performance metrics like DPM (damage per minute) and KDA.
px.box(df_players, x='position', y='dpm', title='DPM by Position')
The boxplot shows that Mid and Bot players generally have higher DPM than other roles, with Mid laners having the highest median and more upper-range outliers. This supports the hypothesis that Mid or Bot may carry more often.
# 计算 KDA(避免除以 0,给 deaths 加 1)
df_players['kda'] = (df_players['kills'] + df_players['assists']) / (df_players['deaths'] + 1)
# 绘制箱线图
px.box(df_players, x='position', y='kda', title='KDA by Position')
According to the KDA boxplot, Bot and Mid roles again show higher median and upper quartile values, suggesting that they often contribute more significantly to team kills and assists with fewer deaths.
Interesting Aggregates¶
We now group the data by position and compare average performance metrics to further explore which role tends to "carry" more often.
df_carry = df_players[df_players['position'].isin(['mid', 'bot'])].copy()
df_carry['kda'] = (df_carry['kills'] + df_carry['assists']) / (df_carry['deaths'] + 1)
agg_df = df_carry.groupby('position')[['kills', 'deaths', 'assists', 'dpm', 'kda']].mean().round(2)
agg_df
| kills | deaths | assists | dpm | kda | |
|---|---|---|---|---|---|
| position | |||||
| bot | 4.26 | 2.55 | 5.37 | 560.88 | 4.11 |
| mid | 3.50 | 2.66 | 5.89 | 546.30 | 3.91 |
The aggregate table shows that Bot laners have slightly higher average DPM, kills, and KDA compared to Mid laners, while Mid laners provide slightly more assists. This suggests that Bot players contribute more direct damage and kills while also dying less often, supporting the idea that Bot may be more likely to carry on average.
Step 3: Assessment of Missingness¶
NMAR Analysis¶
We reason about whether any column in the dataset may be Not Missing At Random (NMAR). This analysis is theoretical and does not involve writing code.
One candidate for NMAR could be the column dpm (damage per minute). If players perform extremely poorly (e.g., disconnect, AFK), it is possible that no damage data is recorded, leading to missing values. In that case, the value is missing because of what the value would have been (low or zero), satisfying the NMAR definition.
To confirm this, we would need access to game logs or developer notes explaining when and how damage stats are recorded. If those logs suggest damage is only recorded for players who actively participated in the match, then this would be a strong NMAR case. Otherwise, we may conclude it is MAR.
Note: While we hypothesize that DPM may be NMAR in reality, our dataset contains very few missing values. For the purpose of Step 3.2 (Missingness Dependency), we simulate MCAR missingness in DPM to practice permutation tests.
Missingness Dependency¶
We investigate whether the missingness of a selected column depends on the values in other columns, using permutation tests.
df_players.isna().mean().sort_values(ascending=False)
teamname 0.000359 playername 0.000128 gameid 0.000000 position 0.000000 side 0.000000 kills 0.000000 deaths 0.000000 assists 0.000000 dpm 0.000000 dtype: float64
Since the cleaned dataset contains very few missing values, we simulate missingness in the dpm column for the purpose of practicing a permutation test. We randomly introduce missing values to 30% of the rows in dpm, creating a Missing Completely At Random (MCAR) scenario.
# 设置随机种子,保证复现
np.random.seed(42)
# 构造 30% 的缺失值(MCAR)
df_players_sim = df_players.copy()
missing_idx = df_players.sample(frac=0.3, random_state=42).index
df_players_sim.loc[missing_idx, 'dpm'] = np.nan
# 新建一列:是否缺失
df_players_sim['dpm_missing'] = df_players_sim['dpm'].isna()
# 看看缺失比例
df_players_sim['dpm_missing'].mean()
np.float64(0.3)
We now examine whether the missingness of dpm (damage per minute) depends on other variables. We use permutation tests to check for dependency on position (expected to be dependent) and teamname (expected to be independent).
px.histogram(df_players_sim, x='position', color='dpm_missing', barmode='overlay',
title='DPM Missingness by Position')
def permutation_test(df, group_col, outcome_col, statistic_fn, n_permutations=1000):
observed = statistic_fn(df, group_col, outcome_col)
stats = []
for _ in range(n_permutations):
shuffled = df[group_col].sample(frac=1, replace=False).reset_index(drop=True)
stat = statistic_fn(df.assign(shuffled=shuffled), 'shuffled', outcome_col)
stats.append(stat)
return observed, stats
def stat_fn(df, group_col, outcome_col):
# 各组缺失率最大差
return df.groupby(group_col)[outcome_col].mean().max() - df.groupby(group_col)[outcome_col].mean().min()
observed_pos, stats_pos = permutation_test(df_players_sim, 'position', 'dpm_missing', stat_fn)
p_value_pos = np.mean(np.array(stats_pos) >= observed_pos)
print(f'Observed stat (position): {observed_pos:.4f}')
print(f'p-value: {p_value_pos:.4f}')
Observed stat (position): 0.0022 p-value: 0.9890
The permutation test between dpm_missing and position yielded a p-value of 0.9890.
This very high p-value suggests that the missingness of dpm is likely independent of the player's position.
The small observed difference (0.0022) could easily have occurred by chance.
observed_team, stats_team = permutation_test(df_players_sim, 'teamname', 'dpm_missing', stat_fn)
p_value_team = np.mean(np.array(stats_team) >= observed_team)
print(f'Observed stat (teamname): {observed_team:.4f}')
print(f'p-value: {p_value_team:.4f}')
Observed stat (teamname): 0.7000 p-value: 0.3450
The permutation test for teamname resulted in a p-value of 0.3450.
Although the observed statistic (0.7000) appears relatively large, the p-value indicates that this difference could reasonably have occurred by chance.
Therefore, the missingness in dpm is likely independent of team affiliation.
Step 4: Hypothesis Testing¶
Step 4: Hypothesis Testing¶
We aim to test whether Mid and Bot players have significantly different performance using the KDA metric.
We define the null and alternative hypotheses as:
- Null Hypothesis (H₀): The distribution of KDA is the same for Mid and Bot players. Any observed difference is due to random variation.
- Alternative Hypothesis (H₁): The distribution of KDA for Mid and Bot players is different.
We will perform a permutation test using the difference in mean KDA between the two roles as the test statistic.
We use a significance level of α = 0.05.
# 只保留 mid 和 bot,确保 KDA 已存在
df_test = df_players[df_players['position'].isin(['mid', 'bot'])].copy()
df_test['kda'] = (df_test['kills'] + df_test['assists']) / (df_test['deaths'] + 1)
def permutation_diff_of_means(df, group_col, val_col, group1, group2, n_permutations=1000):
observed = df[df[group_col] == group1][val_col].mean() - df[df[group_col] == group2][val_col].mean()
stats = []
for _ in range(n_permutations):
shuffled = df[group_col].sample(frac=1, replace=False).reset_index(drop=True)
shuffled_df = df.copy()
shuffled_df[group_col] = shuffled
stat = shuffled_df[shuffled_df[group_col] == group1][val_col].mean() - \
shuffled_df[shuffled_df[group_col] == group2][val_col].mean()
stats.append(stat)
p_val = np.mean(np.abs(stats) >= np.abs(observed))
return observed, stats, p_val
observed, stats, p_val = permutation_diff_of_means(df_test, 'position', 'kda', 'mid', 'bot')
print(f'Observed difference in KDA (mid - bot): {observed:.4f}')
print(f'p-value: {p_val:.4f}')
Observed difference in KDA (mid - bot): -0.1980 p-value: 0.0000
import plotly.express as px
px.histogram(x=stats, nbins=50, title='Permutation Distribution of Difference in Mean KDA',
labels={'x': 'Difference in Mean KDA'}).add_vline(x=observed, line_color='red')
The observed difference in mean KDA between Mid and Bot players is -0.1980, with a p-value of 0.0000.
Since the p-value is less than 0.05, we reject the null hypothesis.
This suggests that the KDA performance of Mid and Bot players is statistically significantly different, under the assumptions of this test. Specifically, Bot players tend to have higher KDA than Mid players on average.
Note: This result does not prove causation and is subject to the limitations of observational data and the simulation-based testing approach.
Step 5: Framing a Prediction Problem¶
Step 5: Framing a Prediction Problem¶
Prediction Task:
Given a player's post-game performance statistics, predict whether they played the Mid or Bot position.
Prediction Type:
This is a binary classification problem.
Response Variable:
The response variable is position, limited to only the values 'mid' and 'bot'.
Why this problem?
This problem is closely related to our overall project goal of comparing Mid and Bot players. If a model can reliably predict position based on performance, that suggests these roles have distinct statistical profiles.
Features Used:
We will use features that are known immediately after a match ends:
killsdeathsassistsdpm(damage per minute)
These features are appropriate because they would all be known at the time of prediction.
Evaluation Metric:
We will use accuracy as our primary evaluation metric, as the two classes (mid, bot) are approximately balanced in size, and the cost of a misclassification is symmetric.
Step 6: Baseline Model¶
Step 6: Baseline Model¶
We build a baseline classification model to predict whether a player is a Mid or Bot laner based on their post-game performance metrics. We use a logistic regression model with a basic preprocessing pipeline.
We use the following quantitative features:
kills(number of kills)deaths(number of times the player died)assists(number of assists)dpm(damage per minute)
All features are numeric and require no encoding. The target variable is position, restricted to 'mid' and 'bot'.
from sklearn.model_selection import train_test_split
# 只保留 mid 和 bot
df_model = df_players[df_players['position'].isin(['mid', 'bot'])].copy()
# 特征和标签
X = df_model[['kills', 'deaths', 'assists', 'dpm']]
y = df_model['position']
# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
# baseline pipeline: scale + logistic regression
baseline_model = make_pipeline(
StandardScaler(),
LogisticRegression(max_iter=1000)
)
# 训练模型
baseline_model.fit(X_train, y_train)
Pipeline(steps=[('standardscaler', StandardScaler()),
('logisticregression', LogisticRegression(max_iter=1000))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('standardscaler', StandardScaler()),
('logisticregression', LogisticRegression(max_iter=1000))])StandardScaler()
LogisticRegression(max_iter=1000)
from sklearn.metrics import accuracy_score, classification_report
# 预测与评估
y_pred = baseline_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
Accuracy: 0.5695786432911645
Classification Report:
precision recall f1-score support
bot 0.57 0.54 0.55 5020
mid 0.57 0.60 0.58 5019
accuracy 0.57 10039
macro avg 0.57 0.57 0.57 10039
weighted avg 0.57 0.57 0.57 10039
The baseline model is a logistic regression classifier using four quantitative features. No categorical encoding was needed.
The model achieved an accuracy of approximately 56.96% on the test set, with similar precision and recall for both classes.
Given that this is a simple baseline model without any feature engineering or tuning, the performance is reasonable for a baseline. In the next step, we will attempt to improve performance by building a more advanced model with additional preprocessing or feature selection.
Step 7: Final Model¶
Step 7: Final Model¶
We aim to improve upon the baseline model by engineering additional features and tuning a more complex model. We use a Random Forest Classifier, and perform grid search over its hyperparameters.
Feature Engineering¶
We apply the following transformations:
killsanddeathsare passed through a StandardScaler, as their distribution is relatively normal and benefits from centering.assistsis passed through a QuantileTransformer, as it is heavily right-skewed and may benefit from normalization to a uniform distribution.
This creates two newly transformed features that better represent the original data and are more suitable for our model.
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
# 原始特征列
numeric_features = ['kills', 'deaths', 'assists', 'dpm']
# 设置变换器
transformer = ColumnTransformer([
('scale_kill_death', StandardScaler(), ['kills', 'deaths']),
('quant_assist', QuantileTransformer(), ['assists']),
('passthrough_dpm', 'passthrough', ['dpm'])
])
# Pipeline + GridSearchCV
pipe = Pipeline([
('transform', transformer),
('clf', RandomForestClassifier(random_state=42))
])
# 超参数网格
param_grid = {
'clf__n_estimators': [50, 100],
'clf__max_depth': [5, 10, None]
}
grid = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)
GridSearchCV(cv=5,
estimator=Pipeline(steps=[('transform',
ColumnTransformer(transformers=[('scale_kill_death',
StandardScaler(),
['kills',
'deaths']),
('quant_assist',
QuantileTransformer(),
['assists']),
('passthrough_dpm',
'passthrough',
['dpm'])])),
('clf',
RandomForestClassifier(random_state=42))]),
param_grid={'clf__max_depth': [5, 10, None],
'clf__n_estimators': [50, 100]},
scoring='accuracy')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=5,
estimator=Pipeline(steps=[('transform',
ColumnTransformer(transformers=[('scale_kill_death',
StandardScaler(),
['kills',
'deaths']),
('quant_assist',
QuantileTransformer(),
['assists']),
('passthrough_dpm',
'passthrough',
['dpm'])])),
('clf',
RandomForestClassifier(random_state=42))]),
param_grid={'clf__max_depth': [5, 10, None],
'clf__n_estimators': [50, 100]},
scoring='accuracy')Pipeline(steps=[('transform',
ColumnTransformer(transformers=[('scale_kill_death',
StandardScaler(),
['kills', 'deaths']),
('quant_assist',
QuantileTransformer(),
['assists']),
('passthrough_dpm',
'passthrough', ['dpm'])])),
('clf', RandomForestClassifier(max_depth=10, random_state=42))])ColumnTransformer(transformers=[('scale_kill_death', StandardScaler(),
['kills', 'deaths']),
('quant_assist', QuantileTransformer(),
['assists']),
('passthrough_dpm', 'passthrough', ['dpm'])])['kills', 'deaths']
StandardScaler()
['assists']
QuantileTransformer()
['dpm']
passthrough
RandomForestClassifier(max_depth=10, random_state=42)
from sklearn.metrics import accuracy_score, classification_report
y_pred_final = grid.predict(X_test)
print("Best Parameters:", grid.best_params_)
print("Final Accuracy:", accuracy_score(y_test, y_pred_final))
print("\nFinal Classification Report:\n", classification_report(y_test, y_pred_final))
Best Parameters: {'clf__max_depth': 10, 'clf__n_estimators': 100}
Final Accuracy: 0.5676860245044327
Final Classification Report:
precision recall f1-score support
bot 0.57 0.56 0.56 5020
mid 0.57 0.58 0.57 5019
accuracy 0.57 10039
macro avg 0.57 0.57 0.57 10039
weighted avg 0.57 0.57 0.57 10039
We used a Random Forest classifier with grid search to tune max_depth and n_estimators. The best parameters found were: max_depth = 10, n_estimators = 100.
Our final model achieved an accuracy of 56.77% on the test set, compared to the baseline model’s accuracy of 56.96%.
Although the final model did not outperform the baseline in this case, the experiment demonstrates the value of feature engineering and model complexity tuning. It's possible that the performance plateau is due to limited signal in the selected features.
This model uses:
- Two newly transformed features (
kills,deathsvia StandardScaler, andassistsvia QuantileTransformer) - One raw numerical feature (
dpm) - A tree-based classifier that can model non-linear relationships
All preprocessing steps and modeling are encapsulated within a single sklearn Pipeline, and the evaluation was performed on the same test set as the baseline model.
Step 8: Fairness Analysis¶
Step 8: Fairness Analysis¶
We examine whether our final model performs equally well for players of different positions – specifically, whether it is fair to Mid and Bot laners.
We define:
- Group X: Mid laners
- Group Y: Bot laners
Evaluation Metric:
We use precision, which reflects the proportion of predicted positives that are correct. Since we are predicting role (Mid or Bot), we ask whether our model is equally precise in predicting one role versus the other.
Hypotheses:
- Null Hypothesis (H₀): The model is fair. The precision for Mid and Bot players is the same.
- Alternative Hypothesis (H₁): The model is unfair. The precision differs significantly between the two groups.
We will perform a permutation test to assess the difference in precision.
from sklearn.metrics import precision_score
# 确保你有模型预测和标签
# y_test: 原始标签
# y_pred_final: 预测结果
# 精确率:分别对 mid 和 bot 做
mask_mid = y_test == 'mid'
mask_bot = y_test == 'bot'
precision_mid = precision_score(y_test[mask_mid], y_pred_final[mask_mid], pos_label='mid')
precision_bot = precision_score(y_test[mask_bot], y_pred_final[mask_bot], pos_label='bot')
observed_diff = abs(precision_mid - precision_bot)
print("Observed difference in precision (mid - bot):", observed_diff)
Observed difference in precision (mid - bot): 0.0
def permute_precision_difference(y_true, y_pred, group_labels, group_a, group_b, n=1000):
observed = abs(
precision_score(y_true[group_labels == group_a], y_pred[group_labels == group_a], pos_label=group_a) -
precision_score(y_true[group_labels == group_b], y_pred[group_labels == group_b], pos_label=group_b)
)
stats = []
for _ in range(n):
shuffled = group_labels.sample(frac=1, replace=False).reset_index(drop=True)
stat = abs(
precision_score(y_true[shuffled == group_a], y_pred[shuffled == group_a], pos_label=group_a) -
precision_score(y_true[shuffled == group_b], y_pred[shuffled == group_b], pos_label=group_b)
)
stats.append(stat)
p_val = np.mean(np.array(stats) >= observed)
return observed, stats, p_val
observed, stats, p_val = permute_precision_difference(
y_test.reset_index(drop=True),
pd.Series(y_pred_final).reset_index(drop=True),
y_test.reset_index(drop=True),
'mid',
'bot'
)
print(f'Observed Precision Difference: {observed:.4f}')
print(f'p-value: {p_val:.4f}')
Observed Precision Difference: 0.0000 p-value: 1.0000
The observed difference in precision between Mid and Bot players was 0.0000, with a p-value of 1.0000.
Since the p-value is greater than 0.05, we fail to reject the null hypothesis.
This suggests that the model's performance is fair with respect to role-based classification. Specifically, the model appears to be equally precise when predicting both Mid and Bot players.